

****************************************
*******I. CLEAN RAW DATA FILES************
****************************************


/* Clean awards file to get funding source */

use "$rawdata\core_award2019q1.dta", clear
ren institution_id submit_university
drop if recipient==""&(unique_award==""|unique_award=="#N/A")
replace overhead=0 if overhead<0
replace total_direct=0 if total_direct<0
drop funding_source_name_c
gsort period_start period_end unique_award submit_university funding_source recipient -sub_org
collapse (sum) overhead total_direct (first) sub_org, by (period_start period_end unique_award submit_university funding_source recipient)
*if an award has multiple source in a period, keep the source with the highest amount
bysort submit_university unique_award recipient period_start period_end: gen count=_N
drop if count>1&funding_source==""
gsort submit_university unique_award recipient period_start period_end -total_direct
by submit_university unique_award recipient period_start period_end: gen order=_n 
keep if order==1
gen byte flag_multiple_source=(count>1)
drop order count

bysort submit_university unique_award recipient: gen count1=_N
bysort submit_university unique_award recipient funding_source: gen count2=_N
gen byte singlesource_recipient= (count1==count2)
bysort submit_university unique_award: gen count3=_N
bysort submit_university unique_award funding_source: gen count4=_N
gen byte singlesource_award= (count3==count4)
drop count*
save "$temp\funding_source", replace

use "$temp\funding_source", clear
keep if singlesource_recipient==1
keep submit_university unique_award recipient funding_source
duplicates drop
ren funding_source fundingsource2
save "$temp\funding_source_recipient", replace

use "$temp\funding_source", clear
keep if singlesource_award==1
keep submit_university unique_award funding_source
duplicates drop
ren funding_source fundingsource3
save "$temp\funding_source_award", replace


use "$temp\funding_source", clear
gsort submit_u unique -sub_org
collapse (first) sub_org , by (submit unique)
save "$temp\award_field", replace

/* Clean employee file */

use "$rawdata\core_employee2019q1.dta", clear
ren institution_id submit_university
ren emp_number new_emp
drop object_code proportion job_title soc_code fte occupation fed_awd umetrics_occ
duplicates drop

*get funding source information from awards data
merge m:1 submit_university unique_award recipient period_start period_end/*
*/ using "$temp\funding_source", keep(1 3) nogen
merge m:1 submit_university unique_award recipient /*
*/ using "$temp\funding_source_recipient", keep(1 3) nogen
merge m:1 submit_university unique_award /*
*/ using "$temp\funding_source_award", keep(1 3) nogen
replace funding_source=fundingsource2 if funding_source==""
replace funding_source=fundingsource3 if funding_source==""
drop overhead-fundingsource3

merge m:1 cfda using "$crosswalk\cfda- agency.dta",/*
*/ keep(1 3) keepus(agency agency_abb program_title) nogen

keep period* unique_award submit_u cfda funding_source agency* new_emp
duplicates drop
compress

gen year_start=substr(period_start,1,4)
gen year_end=substr(period_end,1,4)
destring year_start year_end, replace

unique submit unique_award
unique submit new_emp
unique submit new_emp year_end
unique submit new_emp unique_award period_start period_end

drop if funding_source==""&cfda==""&unique=="#N/A"
compress

gen byte federal_clear=(cfda!=""&cfda!="00.000"&cfda!="99.999"&agency!=0&agency!=.&agency_a!="UNKNOWN")

gen funding_type="STATE" if substr(agency_ab,1,5)=="STATE"
replace funding_type="FEDERAL" if federal_clear==1
replace funding_type="FEDERAL" if agency_ab=="FEDERAL"
replace funding_type="PRIVATE-FORPROFIT" if agency_ab=="FORPROFIT"
replace funding_type="PRIVATE-NONPROFIT" if agency_ab=="NONPROFIT"
replace funding_type="FOREIGN" if agency_ab=="FOREIGN"
replace funding_type="STATE" if agency_ab=="LOCAL"
replace funding_type="UNIVERSITY" if agency_ab=="UNIVERSITY"
replace funding_type="PRIVATE" if agency_ab=="INVEST"|agency_ab=="HOSPITAL"

gen funding_type_source="CFDA" if funding_type!=""

gen temp_agency=substr(unique,1,2)
destring temp_agency, replace force
replace funding_type="FEDERAL" if funding_type==""&temp_agency!=.&substr(unique,3,1)=="."/*
*/&substr(unique,1,6)!="00.000"&temp_agency>9&temp_agency<99&temp_agency!=89
replace cfda=substr(unique,1,6) if cfda==""&funding_type=="FEDERAL"&substr(unique,7,1)==" "
replace cfda=substr(unique,1,5) if cfda==""&funding_type=="FEDERAL"&substr(unique,6,1)==" "
replace cfda=substr(unique,1,4) if cfda==""&funding_type=="FEDERAL"&substr(unique,5,1)==" "
replace funding_type="PRIVATE-NONPROFIT" if cfda==""&funding_type_source!="CFDA"&funding_type=="FEDERAL"&temp_agency==31
replace funding_type_source="CFDA" if funding_type!=""

keep if funding_source!=""|funding_type!=""

replace funding_source=upper(funding_source)
do "N:\GrpHOBAHE\dofile\identify_funding_type_from_source_7_1.do" 

drop temp*
drop funding_source
bysort new_emp submit_u period_start period_end unique: gen count=_N
drop if count>1
drop count
compress
save "$data/employee_cleaned.dta", replace


****************************************
*******II. CONSTRUCT REGRESSION DATASET************
****************************************

*get funding amount by award by year*

use "$rawdata\core_award2019q1.dta", clear
ren institution_id submit_university
drop if recipient==""&(unique_award==""|unique_award=="#N/A")
replace overhead=-overhead if overhead<0
replace total_direct=-total_direct if total_direct<0
collapse (sum) overhead total_direct, by (period_start period_end unique_award submit_u)
drop if unique==""
save "$temp\temp_award_amount_period.dta", replace

*****************************************************
*construct employee level shock for multiple CFDA employees*
*****************************************************

use "$rawdata\core_employee2019q1.dta", clear
ren emp_num new_emp
ren institution_id submit_university
merge m:1 cfda using "$singleaudit\cfda_all", keep(3) nogen keepus(cfda)
egen cfda_num=group(cfda)
bysort new_emp submit_u: egen ncfda=nvals(cfda_num)
drop if proportion<=0&ncfda>1
cap drop ncfda
bysort new_emp submit_u: egen ncfda=nvals(cfda_num)
drop if proportion==.&ncfda>1
collapse (max) proportion (mean) cfda_num, by (new_emp submit_u unique period_start period_end cfda)
replace proportion=1 if proportion>1&proportion<.
collapse (sum) proportion (mean) cfda_num, by (new_emp submit_u cfda)
bysort new_emp submit_u: egen temp_total=sum(proportion)
replace proportion=proportion/temp_total
bysort new_emp submit_u: egen ncfda=nvals(cfda_num)
replace proportion=1 if ncfda==1
bysort new_emp submit_u: egen temp_total2=sum(proportion)
drop if proportion==.&ncfda>1
assert temp_total2>0.99999&temp_total2<=1
drop temp*

joinby cfda using "$singleaudit\singleaudit_cfda"
foreach var in amount amount_rd amount_nonrd {
replace `var'=proportion*log(`var')
}
collapse (sum) amount* (mean) ncfda, by(new_emp submit_u year)

compress
egen emp_num=group(new_emp)
tsset emp_num year
gen d_amountrd=amount_rd-L.amount_rd
gen d_amountrd_lag1=L.amount_rd-L2.amount_rd
gen d_amountrd_lag2=L2.amount_rd-L3.amount_rd
gen d_amountrd_lag3=L3.amount_rd-L4.amount_rd
gen d_amountrd_lag4=L4.amount_rd-L5.amount_rd
drop emp_num
save "$temp\shock_multicfda_employee", replace

use "$rawdata\core_employee2019q1.dta", clear
gen byte faculty=(umetrics_occ=="Faculty"|umetrics_occ=="faculty")
ren emp_num new_emp
ren institution_id submit_university
merge m:1 unique submit using "$temp\award_field", keep(1 3) nogen

collapse (max) fte proportion faculty (first) umetrics_occ sub_org, by (new_emp submit_u unique period_start period_end cfda)
merge 1:1 new_emp submit_university unique period_start period_end using "$data\employee_cleaned"/*
*/, keep(1 3) keepus(funding_type) nogen

bysort period_start period_end unique submit_u: egen total_fte=sum(fte)

merge m:1 period_start period_end unique_award submit_university using "$temp\temp_award_amount_period", nogen keep(1 3)

replace overhead=overhead/total_fte*fte
replace total_direct=total_direct/total_fte*fte
gen overhead_federal=overhead if funding_type=="FEDERAL"
gen total_direct_federal=total_direct if funding_type=="FEDERAL"
gen overhead_private=overhead_ch if substr(funding_type,1,3)=="PRI"
gen total_direct_private=total_direct_ex if substr(funding_type,1,3)=="PRI"
gen total_direct_forprofit=total_direct_ex if funding_type=="PRIVATE-FORPROFIT"
gen total_direct_nonprofit=total_direct_ex if funding_type=="PRIVATE-NONPROFIT"
gen total_direct_state=total_direct_ex if funding_type=="STATE"
gen total_direct_uni=total_direct_ex if funding_type=="UNIVERSITY"
gen year_end=substr(period_end,1,4)
destring year_end, replace

preserve
collapse (sum) total_direct_expenditures, by (new_emp submit_u sub_org)
drop if sub_org==""
gsort new_emp -total_direct_expenditures
collapse (first) sub_org submit_u, by(new_emp)
save "$temp\employee_field", replace
restore

collapse (sum) overhead* total_direct* (max) faculty (first) umetrics_occ, by (new_emp submit_u year_end)

*create balanced panel
drop if year==1900
egen emp_num=group(new_emp)
tsset emp_num year
tsfill, full

preserve
gsort emp_num -new_emp
collapse (first) submit_u new_emp faculty umetrics_occ, by(emp_num)
save "$temp\emp_num_crosswalk", replace
restore

drop submit_u new_emp faculty umetrics_occ
merge m:1 emp_num using "$temp\emp_num_crosswalk", nogen

foreach var in overhead_c overhead_f overhead_p total_direct_e total_direct_fe total_direct_p total_direct_fo total_direct_n total_direct_uni total_direct_state {
replace `var'=0 if `var'==.
}

ren year_end year
merge 1:1 submit_u new_emp year using "$temp\shock_multicfda_employee", keep(1 3) nogen

tsset emp_num year
merge m:1 new_emp using "$temp\employee_field", keep(1 3) nogen
ren submit_u institution_id
merge m:1 institution_id sub_org_unit using "$data\sub_org_singlecfda_fields_classified_new", keep(1 3) keepus(field) nogen
replace field="other" if field==""
ren institution_id submit_university
ren new_emp iris_employee_number
save "$data\final_multicfda.dta", replace


****************************************
*******III. COMBINE PATENT OR JOB HISTORY DATA************
****************************************

use "$data\final_multicfda", clear
ren iris emp_num_2019
merge m:1 emp_num_2019 using "$rawdata\emp_num_xwalk", keep(1 3) nogen
ren emp_num_2018 iris_employee_number
merge m:1 iris year using "$data\inventor_umetrics_byyear", keep(1 3) nogen
foreach var in npatent ncited scaled1 scaled2 npatent_weight npatent_higho npatent_highg npatent_posc npatent_highc npatent_priv npatent_artq4 npatent_jif npatent_totjif npatent_totjif2 npatent_totjif3 npatent_zerocite npatent_quartile1 npatent_quartile2 npatent_quartile3 npatent_quartile4 {
replace `var'=0 if `var'==.
}
drop iris
ren emp_num_2019 iris_employee_number
compress
save "$data\final_multicfda", replace


****************************************
*******IV. REGRESSION************
****************************************

use "$data\final_multicfda", clear

egen fieldid=group(field)
egen uni_suborg=group(submit sub_org)
gen log_expenditure=log(total_direct_ex)
ren total_direct_ex total_direct_expenditure
tsset emp_num year

foreach soc in federal private forprofit {
gen share_`soc'=total_direct_`soc'/total_direct_expenditure
gen share_`soc'_3year=(total_direct_`soc'+L.total_direct_`soc'+L2.total_direct_`soc')/ /*
*/(total_direct_expenditure+L.total_direct_expenditure+L2.total_direct_expenditure)
gen share_`soc'_2year=(total_direct_`soc'+L.total_direct_`soc')/ /*
*/(total_direct_expenditure+L.total_direct_expenditure)
replace share_`soc'_2year=share_`soc' if share_`soc'_2year==.
replace share_`soc'_3year=share_`soc'_2year if share_`soc'_3year==.
}

*drop universities with no private funding
bysort submit year: egen max_shprivate=max(share_private)

gen byte private=(share_private>=0.5)
replace private=. if share_private==.
gen byte forprofit=(share_forprofit>=0.5)
replace forprofit=. if share_forprofit==.
gen byte patent=(npatent>0)
replace ncited=0 if ncited==.&patent==1

replace umetrics_occ="Faculty" if umetrics_occ=="faculty" 
replace umetrics_occ="Graduate Student" if umetrics_occ=="Graduate Student " 
replace umetrics_occ="Graduate Student" if umetrics_occ=="Post Graduate Research"
replace umetrics_occ="Graduate Student" if umetrics_occ=="Research"
replace umetrics_occ="Undergraduate" if umetrics_occ=="Undergraduate " 
replace umetrics_occ="Staff" if umetrics_occ!="Faculty"&umetrics_occ!="Undergraduate"&umetrics_occ!="Graduate Student" 
tab umetrics_occ 

gen share_other=1-share_federal-share_private

gen include=(max>0&year<=2016)

compress
save "$data/regression_sample", replace


